In [11]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
In [9]:
xlow = 0
xhigh = np.pi/2
x = np.linspace(xlow, xhigh, num = 100)
y = np.cos(x)
In [10]:
plt.figure()
plt.plot(x, y)
plt.show()
In [18]:
dim = 100
x_mcmc = np.random.random(dim) * (xhigh - xlow) + xlow
y_mcmc = np.cos(x_mcmc)
In [19]:
plt.figure()
plt.plot(x_mcmc, y_mcmc, 'o')
plt.show()
In [21]:
y_mcmc.mean() * (xhigh - xlow)
Out[21]:
In [ ]:
# completing the truncated line: presumably the one-sigma Monte Carlo error estimate
(xhigh - xlow) * np.sqrt(y_mcmc.var() / dim)
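The estimator itself is (xhigh - xlow) times the sample mean of f, and its statistical error shrinks as 1/sqrt(N). A quick check against the exact value of the integral (a sketch, using the `dim` samples from above):
In [ ]:
exact = np.sin(xhigh) - np.sin(xlow)          # integral of cos(x) over [0, pi/2] is exactly 1
print((xhigh - xlow) * y_mcmc.mean() - exact)  # should be within roughly one sigma of the error above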
In [1]:
from numpy.random import random
The following code is adapted from http://code.activestate.com/recipes/414200-metropolis-hastings-sampler/:
In [23]:
def sdnorm(z):
    """
    Standard normal pdf (Probability Density Function)
    """
    return np.exp(-z*z/2.)/np.sqrt(2*np.pi)

n = 10000
alpha = 1
x = 0.
vec = []
vec.append(x)
innov = random(n) * 2 * alpha - alpha  # random innovation, uniform proposal distribution
for i in range(1, n):
    can = x + innov[i]                            # candidate
    aprob = np.min([1., sdnorm(can)/sdnorm(x)])   # acceptance probability
    u = random(1)
    if u[0] < aprob:
        x = can
        vec.append(x)
In [24]:
len(vec)
Out[24]:
In [25]:
#plotting the results:
#theoretical curve
x = np.arange(-3,3,.1)
y = sdnorm(x)
plt.subplot(211)
plt.title('Metropolis-Hastings')
plt.plot(vec)
plt.subplot(212)
plt.hist(vec, bins=30, density=True)  # 'normed' is gone from modern matplotlib; use density
plt.plot(x,y,'ro')
plt.ylabel('Frequency')
plt.xlabel('x')
plt.legend(('PDF','Samples'))
plt.show()
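A quick sanity check (a sketch): for a standard normal target, the chain's mean and standard deviation should approach 0 and 1.
In [ ]:
samples = np.array(vec)
print(samples.mean(), samples.std())  # should be close to 0 and 1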
My version of the code:
In [41]:
def metropolis_hastings(f, init=0, nsamples=1000):
    alpha = 1
    sample_count = 0
    x = init
    vec = []
    vec.append(x)
    while sample_count < nsamples - 1:
        can = x + random(1)[0] * 2 * alpha - alpha  # candidate
        aprob = np.min([1., f(can)/f(x)])           # acceptance probability (the original hard-coded sdnorm and ignored f)
        u = random(1)
        if u[0] < aprob:
            x = can
            vec.append(x)
            sample_count += 1
    return np.array(vec)
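One caveat (shared with the original recipe): a textbook Metropolis-Hastings chain records the current state on every iteration, repeating it when a proposal is rejected; keeping only accepted moves distorts the stationary distribution. A corrected sketch:
In [ ]:
def metropolis_hastings_chain(f, init=0., nsamples=1000, alpha=1.):
    """Metropolis-Hastings with a uniform proposal on [x - alpha, x + alpha]."""
    x = init
    chain = np.empty(nsamples)
    for i in range(nsamples):
        can = x + random(1)[0] * 2 * alpha - alpha  # candidate
        if random(1)[0] < np.min([1., f(can) / f(x)]):
            x = can                                 # accept the move
        chain[i] = x                                # record x even after a rejection
    return chain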
In [42]:
vec = metropolis_hastings(sdnorm, nsamples=10000)
In [43]:
len(vec)
Out[43]:
In [44]:
#plotting the results:
#theoretical curve
x = np.arange(-3,3,.1)
y = sdnorm(x)
plt.subplot(211)
plt.title('Metropolis-Hastings')
plt.plot(vec)
plt.subplot(212)
plt.hist(vec, bins=30, density=True)
plt.plot(x,y,'ro')
plt.ylabel('Frequency')
plt.xlabel('x')
plt.legend(('PDF','Samples'))
plt.show()
In [45]:
def gauss(z):
    """
    Unnormalized standard normal density; integrates to sqrt(2*pi)
    """
    return np.exp(-z*z/2.)
In [48]:
import scipy.integrate as integrate
In [50]:
integrate.quad(gauss, -np.inf, np.inf)
Out[50]:
In [51]:
np.sqrt(2*np.pi)
Out[51]:
In [167]:
def triangle(z):
    """Triangular bump: rises from 0 at x=0 to 10 at x=10, then falls back to 0 at x=20."""
    x = np.atleast_1d(np.asarray(z, dtype=float))
    return np.piecewise(x, [x < 0, (x > 0) & (x < 10), (x >= 10) & (x <= 20), x > 20],
                        [0, lambda x: x, lambda x: 20 - x, 0])
In [168]:
triangle([1])
Out[168]:
In [169]:
x = np.arange(-100,100,.1)
plt.figure()
plt.plot(x, triangle(x))
plt.show()
In [170]:
type(np.array([1]))
Out[170]:
In [174]:
integrate.quad(triangle, -np.inf, np.inf)
Out[174]:
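quad over (-inf, inf) can miss a function whose support is only [0, 20], since the adaptive sampler probes few points. Integrating over the support directly gives the area, 2 * (10 * 10 / 2) = 100, so triangle/100 integrates to 1:
In [ ]:
integrate.quad(triangle, 0, 20)  # area of the two half-triangles: 100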
In [178]:
# the original lambda ignored its argument and referenced the global x; pass the normalized target instead,
# and start inside the support: triangle(0) = 0 would give a 0/0 acceptance ratio
vec = metropolis_hastings(lambda z: triangle(z)[0] / 100., init=10, nsamples=10000)
In [181]:
x = np.arange(0, 100, .1)
y = triangle(x) / 100.  # normalized pdf
plt.subplot(211)
plt.title('Metropolis-Hastings')
plt.plot(vec)
plt.subplot(212)
plt.hist(vec, bins=30, density=True)
plt.plot(x,y,'ro')
plt.ylabel('Frequency')
plt.xlabel('x')
plt.xlim((-30,30))
plt.legend(('PDF','Samples'))
plt.show()
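A quick check that the chain stays in the support [0, 20] (a sketch):
In [ ]:
print(((vec >= 0) & (vec <= 20)).mean())  # fraction of samples inside the support; should be ~1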
In [184]:
import pymc
In [185]:
@pymc.stochastic(dtype=int)
def switchpoint(value=1900, t_l=1851, t_h=1962):
    """The switchpoint for the rate of disaster occurrence."""
    if value > t_h or value < t_l:
        # Invalid values
        return -np.inf
    else:
        # Uniform log-likelihood
        return -np.log(t_h - t_l + 1)
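In PyMC2 the function body of a @stochastic variable is its log-probability; a minimal check (a sketch):
In [ ]:
print(switchpoint.value)  # 1900
print(switchpoint.logp)   # -log(112): uniform over the 112 years 1851..1962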
In [189]:
from scipy import stats
# gridint, probs and gridlimits were undefined here; they come from the SciPy
# rv_discrete tutorial (a truncated normal discretised onto an integer grid):
npoints = 20
npointsh = npoints // 2
nbound = 4                                    # truncation bound in units of sigma
grid = np.arange(-npointsh, npointsh + 2, 1)  # integer grid
gridlimits = grid - 0.5                       # bin limits
grid = grid[:-1]
probs = np.diff(stats.truncnorm.cdf(gridlimits / npointsh * nbound, -nbound, nbound))
gridint = grid
normdiscrete = stats.rv_discrete(values=(gridint, np.round(probs, decimals=7)), name='normdiscrete')
n_sample = 500
np.random.seed(87655678)  # fix the seed for replicability
rvs = normdiscrete.rvs(size=n_sample)
rvsnd = rvs
f, l = np.histogram(rvs, bins=gridlimits)
sfreq = np.vstack([gridint, f, probs * n_sample]).T
print(sfreq)
In [219]:
from scipy import stats
class deterministic_gen(stats.rv_continuous):
    # note: 1/x is not a normalizable pdf, so the numerically derived
    # cdf/rvs below are not meaningful; this just exercises the machinery
    def _pdf(self, x):
        return 1/x
In [220]:
deterministic = deterministic_gen(name="deterministic")
In [222]:
deterministic.cdf(np.arange(1, 3, 0.5))
In [223]:
deterministic.pdf(np.arange(-3, 3, 0.5))
Out[223]:
In [224]:
deterministic.rvs(size=10)
In [195]:
deterministic?
In [239]:
def getDistribution(data):
    kernel = stats.gaussian_kde(data)
    class rv(stats.rv_continuous):
        def _rvs(self, *x, **y):
            # don't ask me why it's using self._size
            # nor why I have to cast to int
            return kernel.resample(int(self._size))
        def _cdf(self, x):
            return kernel.integrate_box_1d(-np.inf, x)
        def _pdf(self, x):
            return kernel.evaluate(x)
    return rv(name='kdedist')
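Note: self._size is a private attribute of older SciPy; newer releases pass size and random_state into _rvs instead. A sketch of the adapted inner class (assuming a recent SciPy; kernel is the KDE from the enclosing factory):
In [ ]:
class rv_new(stats.rv_continuous):
    def _rvs(self, size=None, random_state=None):
        # gaussian_kde.resample returns shape (d, n); take the first row for 1-d data
        return kernel.resample(int(np.prod(size)))[0]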
In [240]:
f = getDistribution(np.random.randn(100))
In [243]:
f.rvs(size=1)  # a bare positional argument would be taken as loc, not size
In [231]:
def getDistribution(data):
    kernel = stats.gaussian_kde(data)
    class rv(stats.rv_continuous):
        def _cdf(self, x):
            return kernel.integrate_box_1d(-np.inf, x)
        def _pdf(self, x):
            return kernel.evaluate(x)
    return rv(name='kdedist')
Out[231]:
In [244]:
kernel = stats.gaussian_kde(np.random.randn(100))
In [245]:
kernel
Out[245]:
In [246]:
type(kernel)
Out[246]:
In [247]:
kernel.resample(10)
Out[247]:
In [282]:
def getDistribution():
    sigma = 1
    mu = 0
    f = lambda x: 1/(sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))
    class rv(stats.rv_continuous):
        def _pdf(self, x):
            return f(x)
        def _cdf(self, x):
            return integrate.quad(f, -np.inf, x)[0]  # quad returns (value, error); keep the value
    return rv(name='gaussdist')
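rv_continuous draws variates by numerically inverting the cdf, so every rvs call triggers many _cdf evaluations; a closed-form cdf avoids running quad each time. A faster sketch using scipy.special.ndtr, the standard normal cdf:
In [ ]:
from scipy.special import ndtr
class gauss_gen(stats.rv_continuous):
    def _pdf(self, x):
        return np.exp(-x**2 / 2.) / np.sqrt(2 * np.pi)
    def _cdf(self, x):
        return ndtr(x)  # closed form, no quadrature
gauss_rv = gauss_gen(name='gauss_rv')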
In [283]:
a = getDistribution()
In [288]:
%%timeit
a.rvs(size=100)
In [285]:
%pylab inline
In [286]:
plt.figure()
plt.hist(a.rvs(size=1000))
plt.show()
In [297]:
integrate.quad(lambda x: x ** 5, 0, 30)
Out[297]:
In [293]:
a = lambda x: x ** 2
In [294]:
a(np.inf)
Out[294]:
In [162]:
from pymc import stochastic, observed, deterministic, uniform_like, runiform, rnormal, Sampler, Normal, Uniform
from numpy import inf, log, cos, array
import pymc
import numpy.random

true_slope = 1.5
true_intercept = 4
N = 30
true_x = runiform(0, 50, N)
true_y = true_slope*true_x + true_intercept
data_y = rnormal(true_y, 2)  # rnormal takes (mu, tau); tau is the precision 1/sigma**2
data_x = rnormal(true_x, 2)
In [163]:
plt.plot(true_x, true_y, '-')
plt.plot(data_x, data_y, 'o')
Out[163]:
In [164]:
A = array([data_x, np.ones(len(data_x))])
lstsq_theta = numpy.linalg.lstsq(A.T, data_y)[0]
print(lstsq_theta)
In [165]:
@stochastic
def theta(value=array([2., 5.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the parameters."""
    slope, intercept = value
    prob_intercept = uniform_like(intercept, -10, 10)
    prob_slope = log(1./cos(slope)**2)
    return prob_intercept + prob_slope

init_x = data_x.clip(min=0, max=50)

# Inferred true inputs.
x = Uniform('x', lower=0, upper=50, value=init_x)

@deterministic(plot=False)
def modelled_y(x=x, theta=theta):
    """Return y computed from the straight line model, given the
    inferred true inputs and the model parameters."""
    slope, intercept = theta
    return slope*x + intercept

"""
Input error model.
Define the probability of measuring x knowing the true value.
"""
measured_input = Normal('measured_input', mu=x, tau=2, value=data_x, observed=True)

"""
Output error model.
Define the probability of measuring y knowing the true value.
In this case, the true value is assumed to be given by the model, but
structural errors could be integrated to the analysis as well.
"""
y = Normal('y', mu=modelled_y, tau=2, value=data_y, observed=True)
In [166]:
model = pymc.Model([modelled_y, y, x, theta])
mcmc = pymc.MCMC(model)
mcmc.sample(iter=10000, burn=5000, thin=2)
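With iter=10000, burn=5000 and thin=2, the trace keeps (10000 - 5000) / 2 = 2500 samples:
In [ ]:
len(mcmc.trace('theta')[:])  # 2500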
In [175]:
slope_samples = mcmc.trace('theta')[:,0]
intercept_samples = mcmc.trace('theta')[:,1]
In [176]:
mcmc.trace('theta').stats()
Out[176]:
In [177]:
ax = plt.subplot(211)
plt.hist(slope_samples, histtype='stepfilled', bins=30, alpha=0.85,
         label="posterior of slope", color="#A60628", density=True)
plt.legend(loc="upper left")
plt.xlabel("slope value")
plt.axvline(true_slope)
ax = plt.subplot(212)
plt.hist(intercept_samples, histtype='stepfilled', bins=30, alpha=0.85,
         label="posterior of intercept", color="#7A68A6", density=True)
plt.legend(loc="upper left")
plt.xlabel("intercept value")
plt.axvline(true_intercept)
plt.show()
In [178]:
slope_samples.mean()
Out[178]:
In [179]:
slope_samples.std()
Out[179]:
In [180]:
intercept_samples.mean()
Out[180]:
In [181]:
intercept_samples.std()
Out[181]:
In [182]:
pymc.Matplot.plot(mcmc)
In [151]:
pymc_theta = mcmc.trace('theta').stats()['mean']  # the object before .stats() was missing; presumably the theta trace, as above
In [183]:
numpy.random.seed(15)
true_mu = 1.5
true_tau = 50.0
N_samples = 500
mu = pymc.Uniform('mu', lower=-100.0, upper=100.0)
tau = pymc.Gamma('tau', alpha=0.1, beta=0.001)
data = pymc.rnormal(true_mu, true_tau, size=(N_samples,))
y = pymc.Normal('y', mu, tau, value=data, observed=True)
In [189]:
mcmc=pymc.MCMC([mu, tau, data, y])
mcmc.sample(iter=1000, burn=500, thin=2)
In [190]:
print('mu', np.mean(mcmc.trace('mu')[:]))
print('tau', np.mean(mcmc.trace('tau')[:]))
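For comparison (a sketch): with 500 observations the posterior means should sit close to the sample statistics, mu near data.mean() and tau near 1/data.var():
In [ ]:
print(data.mean(), 1 / data.var())  # compare with the posterior means printed above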
In [188]:
pymc.Matplot.plot(mcmc)